# Dickstein, Ho, and Mark (2023)
# This script labels the households who are forced switchers.

# * # * # * # * # * # * #
# PRELIMINARIES         #
# * # * # * # * # * # * #

# Move into the shared library directory. The relative paths used below
# (the sourced script and the CSV files) are resolved against the working
# directory, so this script must be launched from a sibling of "library".
setwd("../library")
# Run the project's shared setup script.
# NOTE(review): presumably this attaches data.table (fread, fwrite, merge,
# `:=`) and defines the helpers mode1() and is.identified() that are called
# further down -- confirm against PreliminariesCode.R.
source("PreliminariesCode.R")

# * # * # * # * # * # * #
# LOADING DATA          #
# * # * # * # * # * # * #

# Read the three inputs: the final exploded dataset, all subscribers in the
# individual market, and the forced-switcher sample.
ExplodedDat <- fread("explodeddata.csv")
SubDat <- fread("finalsubsdata.csv")

# Keep only switchers with a non-missing age, then add a "next year" column;
# the later merges line switchers up with records from year + 1.
switchers <- fread("forced_switchers.csv")[!is.na(age)]
switchers[, year.lead := year + 1]

# * # * # * # * # * # * # * # * # * # * # * # * # * # * # * #
# FIND UNINSURED OBSERVATIONS THAT LOOK LIKE THE SWITCHERS  #
# * # * # * # * # * # * # * # * # * # * # * # * # * # * # * #

# Which switchers are uninsured? Those who were forced out of their plan
# (forced_out == 1) and were not tracked afterwards (tracked == 0).
UninSwitchers <- switchers[forced_out == 1 & tracked == 0]
## Create the household-type indicators used as merge keys against the
## household level dataset. The comparison already yields a logical
## (NA stays NA), so no ifelse() wrapper and no reassignable T / F is needed.
UninSwitchers[, `:=`(
  married = nspouse > 0,
  withkids = ndeps > 0
)]

# Normalizing ndeps, age, risk score, and income: each variable is divided by
# its standard deviation among all individual-market subscribers (SubDat), so
# that the variables enter the Euclidean distance of the matching step on a
# comparable scale.
normalize_vec <- c("age", "ndeps", "sum_concurrent_risk", "best_guess_incomeoverFPL")
norm_cols <- paste0(normalize_vec, "_norm")

# One scale factor per variable, computed once on SubDat and reused for both
# tables so that switchers and subscribers share the same units.
# NOTE(review): sd() is called without na.rm; a missing value in one of these
# SubDat columns would make that column's scale factor NA -- confirm the
# columns are complete.
sd_vec <- unlist(SubDat[, lapply(.SD, sd), .SDcols = normalize_vec])

# Divide column k by sd_vec[k]; driven by normalize_vec itself rather than a
# hard-coded column count.
SubDat[, (norm_cols) := Map(`/`, .SD, sd_vec), .SDcols = normalize_vec]
UninSwitchers[, (norm_cols) := Map(`/`, .SD, sd_vec), .SDcols = normalize_vec]

# Create a household level dataset of the uninsured. This must come after the
# normalization above so that UninDat carries the *_norm columns.
UninDat <- SubDat[chosen_plan_id == "Outside_Option"]

# Euclidean distance between one point and every row of a table.
#   x: numeric vector of length a (one value per column of Y).
#   Y: table with b rows and a numeric columns (anything as.matrix() accepts,
#      e.g. a data.table or data.frame).
# Returns a numeric vector of length b whose j-th element is the distance
# between x and row j of Y (numeric(0) when Y has no rows).
dista <- function(x, Y) {
  Y_mat <- as.matrix(Y)
  if (nrow(Y_mat) == 0L) {
    return(numeric(0))
  }
  # t(Y_mat) is a x b, so x is recycled down each column, i.e. subtracted
  # from every row of Y.
  sq_diff <- (t(Y_mat) - x)^2
  # Vectorized column sums instead of a per-column apply().
  sqrt(colSums(sq_diff))
}

# Conduct the matching procedure: for each uninsured switcher, match them to an
# uninsured household with similar characteristics. Matching is greedy and
# without replacement: switchers are processed in row order, and a household
# that has been matched is removed from the candidate pool.

# copy() so that adding `id` and pruning the pool never modifies UninDat
# itself (a plain `<-` only aliases the same data.table, and `:=` works by
# reference).
UninDat_temp <- copy(UninDat)
UninDat_temp[, id := seq_len(.N)] # id = row position in UninDat

n_switchers <- nrow(UninSwitchers)
match_cols <- paste0(normalize_vec, "_norm")
# NA marks "no match found" (instead of a 0 placeholder that looks like data).
dist_min_vec <- rep(NA_real_, n_switchers)
matched_id_vec <- rep(NA_integer_, n_switchers)

for (i in seq_len(n_switchers)) {
  foo <- UninSwitchers[i, ]
  ## Find all uninsured households observed in the switcher's following year
  ## with the same rating area and marriage status.
  ## NOTE(review): the original description also lists kids status, and
  ## withkids is carried along in the selection, but it is not one of the
  ## merge keys, so candidates are NOT restricted on it. Confirm whether
  ## "withkids" should be added to by.x / by.y.
  bar <- merge(UninDat_temp, foo[, .(year.lead, best_guess_ra, married, withkids)],
    by.x = c("year", "best_guess_ra", "married"),
    by.y = c("year.lead", "best_guess_ra", "married"))
  if (nrow(bar) == 0) {
    print(paste0("There are no observations in the uninsurance data that match switcher", i))
  } else {
    ## Of all these observations, find the one that is closest in terms of
    ## dependents, age, health status score, and income.
    distances <- dista(
      unlist(foo[, match_cols, with = FALSE]),
      bar[, match_cols, with = FALSE])
    # which.min() skips NA distances and returns integer(0) when all are NA;
    # in that case the switcher stays unmatched.
    whichmin <- which.min(distances)
    if (length(whichmin) > 0) {
      # Record the distance of the chosen candidate (min() would return NA
      # as soon as any candidate has a missing distance).
      dist_min_vec[i] <- distances[whichmin]
      drop_id <- bar[["id"]][whichmin]
      matched_id_vec[i] <- drop_id
      # Without replacement: the matched household leaves the pool.
      UninDat_temp <- UninDat_temp[id != drop_id]
    }
  }
  if (i %% 1000 == 0) {
    print(paste0("observation ", i, " complete"))
  }
}
# Remove loop temporaries that exist (they are absent if the loop never ran
# or never reached the matching branch).
rm(list = intersect(c("whichmin", "foo", "bar"), ls()))

# Look the matched households up by their row position in UninDat; indexing
# with NA yields NA of the column's own type for unmatched switchers.
switcher_unin_subid_vec <- UninDat[["subscriberid"]][matched_id_vec]
switcher_unin_year_vec <- UninDat[["year"]][matched_id_vec]
switcheruninDat <- data.table(
  subscriberid = switcher_unin_subid_vec,
  year = switcher_unin_year_vec,
  distance = dist_min_vec)
# Number of switchers matched to each (subscriberid, year); unmatched
# switchers are excluded from the count.
is_matched <- !is.na(matched_id_vec)
switcheruninDatcondensed <- switcheruninDat[is_matched,
  .(N = .N), by = c("subscriberid", "year")]

# * # * # * # * # * # * # * # * # * #
# MAKING THE "SWITCHER" VARIABLE    #
# * # * # * # * # * # * # * # * # * #

# Household-year lookup: for every (hhid, year) in the exploded data, the
# household's subscriber id and a 1/0 flag for whether its plan is the
# outside option (i.e. uninsured).
# NOTE(review): mode1() is defined outside this file; presumably it returns
# the most frequent value within the group -- confirm.
hhidDat <- ExplodedDat[, list(
  subscriberid = mode1(subscriberid),
  uninsured = ifelse(mode1(chosen_plan_id) == "Outside_Option", 1, 0)
), by = .(hhid, year)]

# Finding the insured hhids: forced-out switchers who appear in the
# individual-market sample in the following year are joined to the insured
# household-years on (subscriber id, year).
# all.x = TRUE keeps switchers that find no household; those rows carry
# hhid = NA.
insured_switcher_hhidDat <- merge(
  switchers[forced_out == 1 & in_sample.lead == 1, .(indsubscriberid.lead, year.lead)], # switchers in ind. market
  hhidDat[uninsured == 0], # find their hhids
  by.x = c("indsubscriberid.lead", "year.lead"),
  by.y = c("subscriberid", "year"),
  all.x = TRUE)

# Finding the uninsured hhids: join the matched (subscriberid, year) pairs,
# with N = number of switchers matched to that pair, to the uninsured
# household-years.
uninsured_switcher_hhidDat <- merge(
  switcheruninDatcondensed, # count of subid, year in switcher
  hhidDat[uninsured == 1],
  by = c("subscriberid", "year"))
# Within each (subscriberid, year), keep at most N households, drawn at random
# without replacement when more than N are available.
# NOTE(review): no seed is set in this file, so the draw is only reproducible
# if the sourced setup script calls set.seed() -- confirm.
uninsured_switcher_hhidDat <- uninsured_switcher_hhidDat[,
  .SD[sample(.N, min(.N, mode1(N)), replace = FALSE)],
  by = c("subscriberid", "year")]

# Combining to get all the switcher's hhids:
switcher_hhid <- rbind(
  insured_switcher_hhidDat[, .(hhid, year = year.lead)],
  uninsured_switcher_hhidDat,
  fill = TRUE)
# Insured switchers that found no household come out of the all.x merge with
# hhid = NA. Drop them so that no NA key is carried into the merge onto the
# exploded data.
switcher_hhid <- switcher_hhid[!is.na(hhid), .(hhid, year, switcher = 1)]
n_before_switcher <- nrow(switcher_hhid)
# De-duplicate: a repeated (hhid, year) key would duplicate rows of
# ExplodedDat in the merge that follows.
switcher_hhid <- unique(switcher_hhid)
# NOTE(review): is.identified() is defined outside this file; presumably it
# checks that (hhid, year) uniquely identifies rows -- confirm.
is.identified(switcher_hhid, c("hhid", "year"))

# Merging onto exploded data to create the switcher variable!
# The script writes its result back to the file it reads, so on a re-run the
# input already has a `switcher` column; drop it explicitly if present rather
# than relying on try() to swallow a failure.
if ("switcher" %in% names(ExplodedDat)) {
  ExplodedDat[, switcher := NULL]
}
ExplodedDat <- merge(ExplodedDat,
  switcher_hhid,
  by = c("hhid", "year"),
  all.x = TRUE)
# Household-years without a match are not switchers.
ExplodedDat[is.na(switcher), switcher := 0]

# * # * # * #
# SAVING    #
# * # * # * #
# Writes to the same file name that was read into ExplodedDat at the top of
# the script, so the input is overwritten with the version that includes the
# `switcher` column.
fwrite(ExplodedDat, "explodeddata.csv")


